Loading in Packages
Reading in dataset
# Read the labelled training data (two character columns: text, claim)
nature_data <- read_csv(here("data/training.csv"))
## Rows: 23436 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): text, claim
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Creating a word_count column — the number of words in each claim's text
# Add word_count: number of whitespace-separated tokens in each text.
# Refer to the column by bare name inside mutate() -- the nature_data$text
# form bypasses tidy evaluation and silently misaligns if earlier pipeline
# steps ever filter or reorder rows.
nature_data <- nature_data %>%
  mutate(word_count = str_count(text, "\\S+"))
# Confirm the new column was added
colnames(nature_data)
## [1] "text" "claim" "word_count"
# Inspect the distinct claim labels
unique(nature_data$claim)
## [1] "5_1" "0_0" "1_1" "2_1" "4_4" "3_3" "4_1" "4_2" "1_7" "3_2" "1_4" "5_2"
## [13] "2_3" "4_5" "1_6" "1_3" "1_2" "3_1"
# Frequency of each claim label ("0_0" dominates the dataset)
table(nature_data$claim)
##
## 0_0 1_1 1_2 1_3 1_4 1_6 1_7 2_1 2_3 3_1 3_2 3_3 4_1
## 16302 333 147 229 483 189 427 788 339 207 337 323 328
## 4_2 4_4 4_5 5_1 5_2
## 190 245 182 1373 1014
Removing all claims with the claim label “0_0”
# Drop the "0_0" rows; `claim != "0_0"` is clearer than negating an ==
nature_data_clean <- nature_data %>% filter(claim != "0_0")
Creating a Horizontal Bar Plot displaying all claim labels and count
# Horizontal bar chart of claim-label frequencies (most frequent at the top)
nature_data_clean %>%
  ggplot(aes(x = fct_rev(fct_infreq(claim)))) +
  geom_bar(color = "black", fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    x = "Type of Claim",
    y = "Number of Claims",
    title = "Number of Claims by Type"
  )
Organizing the dataset by super-claim number rather than by specific claim labels
# Collapse each label to its leading "super claim" digit and group by it.
# NOTE(review): the frame is deliberately left grouped by claim_number --
# the downstream count()/select() calls rely on that grouping.
# Use the bare column name: nature_data_clean$claim inside a verb re-reads
# the pre-pipeline object and breaks if earlier steps change the rows.
nature_data_clean <- nature_data_clean %>%
  group_by(claim_number = substr(claim, 1, 1)) %>%
  select(-claim)
Creating a Horizontal Bar Plot displaying all super-claims and count
# Horizontal bar chart of super-claim frequencies (most frequent at the top)
nature_data_clean %>%
  ggplot(aes(x = fct_rev(fct_infreq(claim_number)))) +
  geom_bar(color = "black", fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    x = "Type of Claim",
    y = "Number of Claims",
    title = "Number of Claims by Type"
  )
Looking at word count distribution by super claim
# Word-count distribution, colour-stacked by super claim
ggplot(nature_data_clean, aes(x = word_count, fill = claim_number)) +
geom_histogram(bins = 67, color = "black") +
theme_minimal()
Tokenize data
# Tokenize each text into one word per row.
# Drop word_count by name rather than the fragile positional [,-2]
# (column 2 is word_count after the earlier select(-claim)).
nature_data_clean_tokenized <- nature_data_clean %>%
  select(-word_count) %>%
  unnest_tokens(word, text)
Counting tokens
# Tally token frequencies, most common first; count(sort = TRUE) is the
# one-step equivalent of count() followed by arrange(desc(n))
nature_data_clean_tokenized <- nature_data_clean_tokenized %>%
  count(word, sort = TRUE)
Filtering tokens for stopwords
# Remove English stopwords (applied after counting; their rows are dropped)
nature_data_clean_tokenized <- nature_data_clean_tokenized %>%
filter(!word %in% stopwords("english"))
Creating a Wordcloud
# Word cloud of the most frequent non-stopword tokens
wordcloud(
  words = nature_data_clean_tokenized$word,
  freq = nature_data_clean_tokenized$n,
  min.freq = 5,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "RdYlBu")
)
Creating a Network Plot
# Co-occurrence network for the whole cleaned dataset:
# build a corpus, tokenize (lowercase, no punctuation, no stopwords),
# compute a windowed feature co-occurrence matrix, then plot the
# network restricted to the 30 most frequent features.
nature_data_clean_corpus <- corpus(nature_data_clean$text)
cleaned_toks <- nature_data_clean_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
cooccurrence <- fcm(cleaned_toks, context = "window", tri = FALSE)
top_features <- names(topfeatures(cooccurrence, 30))
fcm_select(cooccurrence, pattern = top_features) %>%
  textplot_network(min_freq = 0.5, edge_color = "#E7C100")
Looking at Bigrams
# Keep only the text column (plus the claim_number grouping column).
# Selecting claim_number explicitly silences dplyr's
# "Adding missing grouping variables" message and yields the same columns.
nature_data_clean_claims <- nature_data_clean %>%
  select(claim_number, text)
## Adding missing grouping variables: `claim_number`
# Preview: one row per claim text, grouped by super claim
nature_data_clean_claims
## # A tibble: 7,134 × 2
## # Groups: claim_number [5]
## claim_number text
## <chr> <chr>
## 1 5 "What do you do if you are a global warming alarmist and real-w…
## 2 1 "Now, I am very interested in the AMO, since it strongly influe…
## 3 2 "There could also be other unknown mechanisms driven by solar c…
## 4 4 "One key problem is the sheer difficulty in building new power …
## 5 1 "I am sure that we can expect to see similar coverage about the…
## 6 3 "According to Donohue and his colleagues, climbing levels of CO…
## 7 1 "Moreover the WBDGE site writes that Arctic sea ice has grown s…
## 8 4 "US energy policy is behind much of the price increase, the Oba…
## 9 5 "First, the computer climate models on which predictions of rap…
## 10 4 "Fourth, experience in Europe shows emissions trading markets a…
## # ℹ 7,124 more rows
# Bigrams: split texts into word pairs, drop any pair containing a
# stopword, rejoin, and count (counts are per super-claim group)
ngrams <- nature_data_clean_claims %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ")
ngrams_counts <- ngrams %>%
  count(bigram, sort = TRUE)
head(ngrams_counts)
## # A tibble: 6 × 3
## # Groups: claim_number [2]
## claim_number bigram n
## <chr> <chr> <int>
## 1 5 global warming 552
## 2 1 global warming 422
## 3 5 climate change 349
## 4 1 sea level 300
## 5 5 climate models 282
## 6 1 sea ice 246
Looking at Fourgrams
# Four-grams: same pipeline as the bigrams but with four-word windows;
# a four-gram survives only if none of its words is a stopword
ngrams4 <- nature_data_clean_claims %>%
  unnest_tokens(fourgram, text, token = "ngrams", n = 4) %>%
  separate(fourgram, c("word1", "word2", "word3", "word4"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word,
    !word4 %in% stop_words$word
  ) %>%
  unite(fourgram, word1, word2, word3, word4, sep = " ")
ngrams4_count <- ngrams4 %>%
  count(fourgram, sort = TRUE)
head(ngrams4_count)
## # A tibble: 6 × 3
## # Groups: claim_number [3]
## claim_number fourgram n
## <chr> <chr> <int>
## 1 1 arctic sea ice extent 18
## 2 5 sea surface temperature anomalies 13
## 3 1 antarctic sea ice extent 12
## 4 2 sea surface temperature anomalies 12
## 5 2 sea surface temperature records 12
## 6 1 global warming policy foundation 11
Creating Network Plots for each Super Claim
SUPER CLAIM 1 = Global warming is not happening
# Super claim 1: subset, tokenize, and plot a co-occurrence network of the
# 30 most frequent features.
# Spell out TRUE/FALSE -- T and F are ordinary variables and can be
# reassigned, silently changing behaviour.
nature_claim_1 <- nature_data_clean %>%
  filter(claim_number == "1")
nature_claim_1_corpus <- corpus(nature_claim_1)
toks_claim_1 <- nature_claim_1_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat_claim_1 <- fcm(toks_claim_1, context = "window", tri = FALSE)
feat_claim_1 <- names(topfeatures(fcmat_claim_1, 30))
network_claim_1 <- fcm_select(fcmat_claim_1, pattern = feat_claim_1) %>%
  textplot_network(min_freq = 0.5)
network_claim_1
SUPER CLAIM 2 = Human greenhouse gases are not causing global warming
# Super claim 2: same network pipeline as claim 1.
# TRUE/FALSE spelled out instead of the reassignable T/F shorthands.
nature_claim_2 <- nature_data_clean %>%
  filter(claim_number == "2")
nature_claim_2_corpus <- corpus(nature_claim_2)
toks_claim_2 <- nature_claim_2_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat_claim_2 <- fcm(toks_claim_2, context = "window", tri = FALSE)
feat_claim_2 <- names(topfeatures(fcmat_claim_2, 30))
network_claim_2 <- fcm_select(fcmat_claim_2, pattern = feat_claim_2) %>%
  textplot_network(min_freq = 0.5)
network_claim_2
SUPER CLAIM 3 = Climate impacts are not bad
# Super claim 3: same network pipeline as claim 1.
# TRUE/FALSE spelled out instead of the reassignable T/F shorthands.
nature_claim_3 <- nature_data_clean %>%
  filter(claim_number == "3")
nature_claim_3_corpus <- corpus(nature_claim_3)
toks_claim_3 <- nature_claim_3_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat_claim_3 <- fcm(toks_claim_3, context = "window", tri = FALSE)
feat_claim_3 <- names(topfeatures(fcmat_claim_3, 30))
network_claim_3 <- fcm_select(fcmat_claim_3, pattern = feat_claim_3) %>%
  textplot_network(min_freq = 0.5)
network_claim_3
SUPER CLAIM 4 = Climate solutions won’t work
# Super claim 4: same network pipeline as claim 1 (this plot keeps its
# custom gold edge colour).
# TRUE/FALSE spelled out instead of the reassignable T/F shorthands.
nature_claim_4 <- nature_data_clean %>%
  filter(claim_number == "4")
nature_claim_4_corpus <- corpus(nature_claim_4)
toks_claim_4 <- nature_claim_4_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat_claim_4 <- fcm(toks_claim_4, context = "window", tri = FALSE)
feat_claim_4 <- names(topfeatures(fcmat_claim_4, 30))
network_claim_4 <- fcm_select(fcmat_claim_4, pattern = feat_claim_4) %>%
  textplot_network(min_freq = 0.5, edge_color = "#E7C100")
network_claim_4
SUPER CLAIM 5 = Climate movement/science is unreliable
# Super claim 5: same network pipeline as claim 1.
# TRUE/FALSE spelled out instead of the reassignable T/F shorthands.
nature_claim_5 <- nature_data_clean %>%
  filter(claim_number == "5")
nature_claim_5_corpus <- corpus(nature_claim_5)
toks_claim_5 <- nature_claim_5_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat_claim_5 <- fcm(toks_claim_5, context = "window", tri = FALSE)
feat_claim_5 <- names(topfeatures(fcmat_claim_5, 30))
network_claim_5 <- fcm_select(fcmat_claim_5, pattern = feat_claim_5) %>%
  textplot_network(min_freq = 0.5)
network_claim_5
GRID ARRANGE - Putting together all network plots for comparison
# Display the five super-claim networks together for comparison
grid.arrange(network_claim_1, network_claim_2, network_claim_3, network_claim_4, network_claim_5)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## ggrepel: 3 unlabeled data points (too many overlaps). Consider increasing max.overlaps
Creating a Wordcloud for Super Claim 4
# Word cloud restricted to super claim 4: tokenize the claim-4 texts,
# count word frequencies, drop English stopwords, then plot
final_wordcloud <- nature_data_clean %>%
  filter(claim_number == "4") %>%
  unnest_tokens(word, text)
final_wordcloud_tokenized <- final_wordcloud %>%
  count(word) %>%
  arrange(desc(n)) %>%
  filter(!word %in% stopwords("english"))
wordcloud(
  words = final_wordcloud_tokenized$word,
  freq = final_wordcloud_tokenized$n,
  min.freq = 5,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "RdYlBu")
)
Reading in dataset - This dataset is a result from running EmoRoBERTa in Google Colab with the nature_data dataset
# EmoRoBERTa predictions for the cleaned texts
# (columns: unnamed index `...1`, text, claim, emotion)
emoroberta <- read_csv(here("data/climate_change.csv"))
## New names:
## Rows: 7134 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): text, claim, emotion dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Exploring the dataset
# Frequency of each predicted emotion ("neutral" dominates)
table(emoroberta$emotion)
##
## admiration amusement anger annoyance approval
## 65 25 10 64 362
## caring confusion curiosity desire disappointment
## 6 161 467 11 53
## disapproval disgust embarrassment excitement fear
## 346 26 17 34 76
## gratitude joy love nervousness neutral
## 24 14 2 7 3872
## optimism realization relief remorse sadness
## 39 1292 1 2 41
## surprise
## 117
# Distinct emotion labels present in the predictions
unique(emoroberta$emotion)
## [1] "curiosity" "approval" "neutral" "disapproval"
## [5] "realization" "fear" "admiration" "amusement"
## [9] "surprise" "caring" "gratitude" "annoyance"
## [13] "confusion" "joy" "sadness" "excitement"
## [17] "disappointment" "optimism" "embarrassment" "disgust"
## [21] "desire" "anger" "nervousness" "relief"
## [25] "remorse" "love"
# Dataset dimensions
dim(emoroberta)
## [1] 7134 4
# Structure and column types
str(emoroberta)
## spc_tbl_ [7,134 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:7134] 0 2 6 15 16 17 21 22 23 30 ...
## $ text : chr [1:7134] "What do you do if you are a global warming alarmist and real-world temperatures do not warm as much as your cli"| __truncated__ "Now, I am very interested in the AMO, since it strongly influences Atlantic hurricanes, Arctic sea ice, and Gre"| __truncated__ "There could also be other unknown mechanisms driven by solar changes that exaggerate the effect of small variat"| __truncated__ "One key problem is the sheer difficulty in building new power plants in America today. Politically powerful gre"| __truncated__ ...
## $ claim : chr [1:7134] "5_1" "1_1" "2_1" "4_4" ...
## $ emotion: chr [1:7134] "curiosity" "approval" "neutral" "disapproval" ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. text = col_character(),
## .. claim = col_character(),
## .. emotion = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Column names (note the unnamed index column "...1")
colnames(emoroberta)
## [1] "...1" "text" "claim" "emotion"
Cleaning up dataset - Filtering out claims with the “neutral” sentiment, displaying what super-claim belongs to each text
# Drop the unnamed index column by name (robust, unlike positional [,-1]),
# derive the super-claim digit, and remove neutral-emotion rows.
# NOTE(review): rownames(emoroberta) <- NULL is a no-op on a tibble but is
# kept for parity with the original script.
rownames(emoroberta) <- NULL
emoroberta <- emoroberta %>% select(-any_of("...1"))
emoroberta_clean_1 <- emoroberta %>%
  group_by(claim_number = substr(claim, 1, 1)) %>%
  select(-claim) %>%
  filter(emotion != "neutral")
Turning the dataset into a dataframe
# Tabulate emotion frequencies and convert to a plain data frame for ggplot
emotions <- table(emoroberta_clean_1$emotion)
emotions_data <- data.frame(Emotions = names(emotions), Frequency = as.numeric(emotions))
Creating a horizontal bar plot with sentiment scores
# Horizontal bar chart of emotion frequencies, largest at the top.
# geom_col() is the idiomatic replacement for geom_bar(stat = "identity")
# when the heights are pre-computed.
ggplot(emotions_data, aes(x = Frequency, y = reorder(Emotions, Frequency), fill = Emotions)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(title = "Sentiment Analysis", x = "Frequency", y = "Emotions")
Creating a dataframe with emotion, claim numbers, and frequency
# Cross-tabulate super claim x emotion, then reshape to a long data frame
emotions_data_2 <- table(emoroberta_clean_1$claim_number, emoroberta_clean_1$emotion)
emotions_count <- as.data.frame(emotions_data_2)
names(emotions_count) <- c("Claim_Number", "Emotion", "Frequency")
Creating a bar chart displaying emotions for each super claim and frequency
# Stacked bar of emotion frequencies per super claim.
# The original geom_histogram(stat = "identity") emitted
# "Ignoring unknown parameters: binwidth, bins, pad"; geom_col() is the
# correct geom for pre-computed frequencies and draws the same bars.
ggplot(data = emotions_count, aes(x = Frequency, y = Claim_Number)) +
  geom_col(aes(fill = Emotion)) +
  coord_flip() +
  labs(x = "Count", y = "Super Claim", title = "Sentiment Analysis")
## Warning in geom_histogram(aes(fill = Emotion), stat = "identity"): Ignoring
## unknown parameters: `binwidth`, `bins`, and `pad`